In [2]:
#Import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', None)
import plotly.express as px #for visualization
import matplotlib.pyplot as plt #for visualization
#Read the dataset
data_df = pd.read_csv("churn.csv")
#Get overview of the data
def dataoveriew(df, message):
print(f'{message}:n')
print('Number of rows: ', df.shape[0])
print("nNumber of features:", df.shape[1])
print("nData Features:")
print(df.columns.tolist())
print("nMissing values:", df.isnull().sum().values.sum())
print("nUnique values:")
print(df.nunique())
dataoveriew(data_df, 'Overview of the dataset')
Overview of the dataset:n Number of rows: 7043 nNumber of features: 21 nData Features: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'] nMissing values: 0 nUnique values: customerID 7043 gender 2 SeniorCitizen 2 Partner 2 Dependents 2 tenure 73 PhoneService 2 MultipleLines 3 InternetService 3 OnlineSecurity 3 OnlineBackup 3 DeviceProtection 3 TechSupport 3 StreamingTV 3 StreamingMovies 3 Contract 3 PaperlessBilling 2 PaymentMethod 4 MonthlyCharges 1585 TotalCharges 6531 Churn 2 dtype: int64
In [3]:
import plotly.express as px
# Count the values of the 'Churn' column
target_instance = data_df["Churn"].value_counts().reset_index()
# Rename the columns for easier reference
target_instance.columns = ['Category', 'Count']
# Plot the pie chart
fig = px.pie(
target_instance,
values='Count',
names='Category',
color_discrete_sequence=["blue", "pink"],
title='Distribution of Churn'
)
# Show the pie chart
fig.show()
In [4]:
#Defining bar chart function
def bar(feature, df=data_df ):
#Groupby the categorical feature
temp_df = df.groupby([feature, 'Churn']).size().reset_index()
temp_df = temp_df.rename(columns={0:'Count'})
#Calculate the value counts of each distribution and it's corresponding Percentages
value_counts_df = df[feature].value_counts().to_frame().reset_index()
categories = [cat[1][0] for cat in value_counts_df.iterrows()]
#Calculate the value counts of each distribution and it's corresponding Percentages
num_list = [num[1][1] for num in value_counts_df.iterrows()]
div_list = [element / sum(num_list) for element in num_list]
percentage = [round(element * 100,1) for element in div_list]
#Defining string formatting for graph annotation
#Numeric section
def num_format(list_instance):
formatted_str = ''
for index,num in enumerate(list_instance):
if index < len(list_instance)-2:
formatted_str=formatted_str+f'{num}%, ' #append to empty string(formatted_str)
elif index == len(list_instance)-2:
formatted_str=formatted_str+f'{num}% & '
else:
formatted_str=formatted_str+f'{num}%'
return formatted_str
#Categorical section
def str_format(list_instance):
formatted_str = ''
for index, cat in enumerate(list_instance):
if index < len(list_instance)-2:
formatted_str=formatted_str+f'{cat}, '
elif index == len(list_instance)-2:
formatted_str=formatted_str+f'{cat} & '
else:
formatted_str=formatted_str+f'{cat}'
return formatted_str
#Running the formatting functions
num_str = num_format(percentage)
cat_str = str_format(categories)
#Setting graph framework
fig = px.bar(temp_df, x=feature, y='Count', color='Churn', title=f'Churn rate by {feature}', barmode="group", color_discrete_sequence=["blue", "pink"])
fig.add_annotation(
text=f'Value count of distribution of {cat_str} are<br>{num_str} percentage respectively.',
align='left',
showarrow=False,
xref='paper',
yref='paper',
x=1.4,
y=1.3,
bordercolor='black',
borderwidth=1)
fig.update_layout(
# margin space for the annotations on the right
margin=dict(r=400),
)
return fig.show()
In [5]:
#Gender feature plot
bar('gender')
#SeniorCitizen feature plot
data_df.loc[data_df.SeniorCitizen==0,'SeniorCitizen'] = "No" #convert 0 to No in all data instances
data_df.loc[data_df.SeniorCitizen==1,'SeniorCitizen'] = "Yes" #convert 1 to Yes in all data instances
bar('SeniorCitizen')
#Partner feature plot
bar('Partner')
#Dependents feature plot
bar('Dependents')
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\3807008499.py:4: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'No' has dtype incompatible with int64, please explicitly cast to a compatible dtype first. C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
In [6]:
bar('PhoneService')
bar('MultipleLines')
bar('InternetService')
bar('OnlineSecurity')
bar('OnlineBackup')
bar('DeviceProtection')
bar('TechSupport')
bar('StreamingTV')
bar('StreamingMovies')
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
In [7]:
bar('Contract')
bar('PaperlessBilling')
bar('PaymentMethod')
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
In [8]:
data_df.dtypes
Out[8]:
customerID object gender object SeniorCitizen object Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges object Churn object dtype: object
In [9]:
# Let’s catch the error
try:
data_df['TotalCharges'] = data_df['TotalCharges'].astype(float)
except ValueError as ve:
print (ve)
could not convert string to float: ' '
In [10]:
data_df['TotalCharges'] = pd.to_numeric(data_df['TotalCharges'],errors='coerce')
#Fill the missing values with with the median value
data_df['TotalCharges'] = data_df['TotalCharges'].fillna(data_df['TotalCharges'].median())
In [13]:
# Defining the histogram plotting function
def hist(feature):
group_df = data_df.groupby([feature, 'Churn']).size().reset_index()
group_df = group_df.rename(columns={0: 'Count'})
fig = px.histogram(group_df, x=feature, y='Count', color='Churn', marginal='box', title=f'Churn rate frequency to {feature} distribution', color_discrete_sequence=["blue", "pink"])
fig.show()
In [14]:
hist('tenure')
hist('MonthlyCharges')
hist('TotalCharges')
In [15]:
#Create an empty dataframe
bin_df = pd.DataFrame()
#Update the binning dataframe
bin_df['tenure_bins'] = pd.qcut(data_df['tenure'], q=3, labels= ['low', 'medium', 'high'])
bin_df['MonthlyCharges_bins'] = pd.qcut(data_df['MonthlyCharges'], q=3, labels= ['low', 'medium', 'high'])
bin_df['TotalCharges_bins'] = pd.qcut(data_df['TotalCharges'], q=3, labels= ['low', 'medium', 'high'])
bin_df['Churn'] = data_df['Churn']
#Plot the bar chart of the binned variables
bar('tenure_bins', bin_df)
bar('MonthlyCharges_bins', bin_df)
bar('TotalCharges_bins', bin_df)
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
In [16]:
# The customerID column isnt useful as the feature is used for identification of customers.
data_df.drop(["customerID"],axis=1,inplace = True)
# Encode categorical features
#Defining the map function
def binary_map(feature):
return feature.map({'Yes':1, 'No':0})
## Encoding target feature
data_df['Churn'] = data_df[['Churn']].apply(binary_map)
# Encoding gender category
data_df['gender'] = data_df['gender'].map({'Male':1, 'Female':0})
#Encoding other binary category
binary_list = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
data_df[binary_list] = data_df[binary_list].apply(binary_map)
#Encoding the other categoric features with more than two categories
data_df = pd.get_dummies(data_df, drop_first=True)
In [19]:
# Checking the correlation between features
corr = data_df.corr()
fig = px.imshow(corr,width=1000, height=1000)
fig.show()
In [20]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
#Change variable name separators to '_'
all_columns = [column.replace(" ", "_").replace("(", "_").replace(")", "_").replace("-", "_") for column in data_df.columns]
#Effect the change to the dataframe column names
data_df.columns = all_columns
#Prepare it for the GLM formula
glm_columns = [e for e in all_columns if e not in ['customerID', 'Churn']]
glm_columns = ' + '.join(map(str, glm_columns))
#Fiting it to the Generalized Linear Model
glm_model = smf.glm(formula=f'Churn ~ {glm_columns}', data=data_df, family=sm.families.Binomial())
res = glm_model.fit()
print(res.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: Churn No. Observations: 7043
Model: GLM Df Residuals: 7019
Model Family: Binomial Df Model: 23
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2914.7
Date: Thu, 15 Aug 2024 Deviance: 5829.3
Time: 13:08:44 Pearson chi2: 8.04e+03
No. Iterations: 7 Pseudo R-squ. (CS): 0.2807
Covariance Type: nonrobust
=================================================================================================================
coef std err z P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------------------------------------
Intercept 0.8274 0.748 1.106 0.269 -0.639 2.294
MultipleLines_No_phone_service[T.True] 0.3238 0.106 3.061 0.002 0.116 0.531
MultipleLines_Yes[T.True] 0.4469 0.177 2.524 0.012 0.100 0.794
InternetService_Fiber_optic[T.True] 1.7530 0.798 2.198 0.028 0.190 3.316
InternetService_No[T.True] -0.2559 0.115 -2.220 0.026 -0.482 -0.030
OnlineSecurity_No_internet_service[T.True] -0.2559 0.115 -2.220 0.026 -0.482 -0.030
OnlineSecurity_Yes[T.True] -0.2055 0.179 -1.150 0.250 -0.556 0.145
OnlineBackup_No_internet_service[T.True] -0.2559 0.115 -2.220 0.026 -0.482 -0.030
OnlineBackup_Yes[T.True] 0.0258 0.175 0.147 0.883 -0.318 0.369
DeviceProtection_No_internet_service[T.True] -0.2559 0.115 -2.220 0.026 -0.482 -0.030
DeviceProtection_Yes[T.True] 0.1477 0.176 0.838 0.402 -0.198 0.493
TechSupport_No_internet_service[T.True] -0.2559 0.115 -2.220 0.026 -0.482 -0.030
TechSupport_Yes[T.True] -0.1789 0.180 -0.991 0.322 -0.533 0.175
StreamingTV_No_internet_service[T.True] -0.2559 0.115 -2.220 0.026 -0.482 -0.030
StreamingTV_Yes[T.True] 0.5912 0.326 1.813 0.070 -0.048 1.230
StreamingMovies_No_internet_service[T.True] -0.2559 0.115 -2.220 0.026 -0.482 -0.030
StreamingMovies_Yes[T.True] 0.6038 0.326 1.850 0.064 -0.036 1.244
Contract_One_year[T.True] -0.6671 0.107 -6.208 0.000 -0.878 -0.456
Contract_Two_year[T.True] -1.3896 0.176 -7.904 0.000 -1.734 -1.045
PaymentMethod_Credit_card__automatic_[T.True] -0.0865 0.114 -0.758 0.448 -0.310 0.137
PaymentMethod_Electronic_check[T.True] 0.3057 0.094 3.236 0.001 0.121 0.491
PaymentMethod_Mailed_check[T.True] -0.0567 0.115 -0.493 0.622 -0.282 0.168
gender -0.0219 0.065 -0.338 0.736 -0.149 0.105
SeniorCitizen 0.2151 0.085 2.545 0.011 0.049 0.381
Partner -0.0027 0.078 -0.035 0.972 -0.155 0.150
Dependents -0.1538 0.090 -1.714 0.087 -0.330 0.022
tenure -0.0594 0.006 -9.649 0.000 -0.071 -0.047
PhoneService 0.5036 0.692 0.728 0.467 -0.852 1.860
PaperlessBilling 0.3418 0.074 4.590 0.000 0.196 0.488
MonthlyCharges -0.0404 0.032 -1.272 0.203 -0.103 0.022
TotalCharges 0.0003 7.01e-05 4.543 0.000 0.000 0.000
=================================================================================================================
In [21]:
np.exp(res.params)
Out[21]:
Intercept 2.287343 MultipleLines_No_phone_service[T.True] 1.382358 MultipleLines_Yes[T.True] 1.563475 InternetService_Fiber_optic[T.True] 5.771657 InternetService_No[T.True] 0.774257 OnlineSecurity_No_internet_service[T.True] 0.774257 OnlineSecurity_Yes[T.True] 0.814269 OnlineBackup_No_internet_service[T.True] 0.774257 OnlineBackup_Yes[T.True] 1.026127 DeviceProtection_No_internet_service[T.True] 0.774257 DeviceProtection_Yes[T.True] 1.159152 TechSupport_No_internet_service[T.True] 0.774257 TechSupport_Yes[T.True] 0.836193 StreamingTV_No_internet_service[T.True] 0.774257 StreamingTV_Yes[T.True] 1.806134 StreamingMovies_No_internet_service[T.True] 0.774257 StreamingMovies_Yes[T.True] 1.829067 Contract_One_year[T.True] 0.513185 Contract_Two_year[T.True] 0.249179 PaymentMethod_Credit_card__automatic_[T.True] 0.917142 PaymentMethod_Electronic_check[T.True] 1.357617 PaymentMethod_Mailed_check[T.True] 0.944913 gender 0.978355 SeniorCitizen 1.239957 Partner 0.997312 Dependents 0.857471 tenure 0.942322 PhoneService 1.654668 PaperlessBilling 1.407543 MonthlyCharges 0.960432 TotalCharges 1.000318 dtype: float64
In [22]:
#feature scaling
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
data_df['tenure'] = sc.fit_transform(data_df[['tenure']])
data_df['MonthlyCharges'] = sc.fit_transform(data_df[['MonthlyCharges']])
data_df['TotalCharges'] = sc.fit_transform(data_df[['TotalCharges']])
In [23]:
# Import Machine learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
#Import metric for performance evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#Split data into train and test sets
from sklearn.model_selection import train_test_split
X = data_df.drop('Churn', axis=1)
y = data_df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)
#Defining the modelling function
def modeling(alg, alg_name, params={}):
model = alg(**params) #Instantiating the algorithm class and unpacking parameters if any
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
#Performance evaluation
def print_scores(alg, y_true, y_pred):
print(alg_name)
acc_score = accuracy_score(y_true, y_pred)
print("accuracy: ",acc_score)
pre_score = precision_score(y_true, y_pred)
print("precision: ",pre_score)
rec_score = recall_score(y_true, y_pred)
print("recall: ",rec_score)
f_score = f1_score(y_true, y_pred, average='weighted')
print("f1_score: ",f_score)
print_scores(alg, y_test, y_pred)
return model
# Running logistic regression model
log_model = modeling(LogisticRegression, 'Logistic Regression')
Logistic Regression accuracy: 0.7979176526265973 precision: 0.6274509803921569 recall: 0.5745062836624776 f1_score: 0.7949702200781946
In [24]:
# Feature selection to improve model building
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
log = LogisticRegression()
rfecv = RFECV(estimator=log, cv=StratifiedKFold(10, random_state=50, shuffle=True), scoring="accuracy")
rfecv.fit(X, y)
Out[24]:
RFECV(cv=StratifiedKFold(n_splits=10, random_state=50, shuffle=True),
estimator=LogisticRegression(), scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RFECV(cv=StratifiedKFold(n_splits=10, random_state=50, shuffle=True),
estimator=LogisticRegression(), scoring='accuracy')LogisticRegression()
LogisticRegression()
In [27]:
import matplotlib.pyplot as plt
# Plotting the graph for Recursive Feature Elimination (RFE)
plt.figure(figsize=(8, 6))
# Plot the mean cross-validation score vs. number of features
# Accessing mean test scores from cv_results_
plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1),
rfecv.cv_results_['mean_test_score'], marker='o')
# Adding grid for better readability
plt.grid(True)
# Set x-axis ticks to match the number of features
plt.xticks(range(1, X.shape[1] + 1))
# Labeling the axes
plt.xlabel("Number of Selected Features")
plt.ylabel("Cross-Validation Score (CV Score)")
# Adding a title to the plot
plt.title("Recursive Feature Elimination (RFE)")
# Display the plot
plt.show()
# Print the optimal number of features
print("The optimal number of features: {}".format(rfecv.n_features_))
The optimal number of features: 25
In [29]:
# Saving dataframe with optimal features
X_rfe = X.iloc[:, rfecv.support_]
# Overview of the optimal features in comparison with the initial dataframe
print("X dimension: {}".format(X.shape))
print("X column list:", X.columns.tolist())
print("X_rfe dimension: {}".format(X_rfe.shape))
print("X_rfe column list:", X_rfe.columns.tolist())
X dimension: (7043, 30) X column list: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'MultipleLines_No_phone_service', 'MultipleLines_Yes', 'InternetService_Fiber_optic', 'InternetService_No', 'OnlineSecurity_No_internet_service', 'OnlineSecurity_Yes', 'OnlineBackup_No_internet_service', 'OnlineBackup_Yes', 'DeviceProtection_No_internet_service', 'DeviceProtection_Yes', 'TechSupport_No_internet_service', 'TechSupport_Yes', 'StreamingTV_No_internet_service', 'StreamingTV_Yes', 'StreamingMovies_No_internet_service', 'StreamingMovies_Yes', 'Contract_One_year', 'Contract_Two_year', 'PaymentMethod_Credit_card__automatic_', 'PaymentMethod_Electronic_check', 'PaymentMethod_Mailed_check'] X_rfe dimension: (7043, 25) X_rfe column list: ['SeniorCitizen', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'TotalCharges', 'MultipleLines_No_phone_service', 'MultipleLines_Yes', 'InternetService_Fiber_optic', 'InternetService_No', 'OnlineSecurity_No_internet_service', 'OnlineSecurity_Yes', 'OnlineBackup_No_internet_service', 'OnlineBackup_Yes', 'DeviceProtection_No_internet_service', 'TechSupport_No_internet_service', 'TechSupport_Yes', 'StreamingTV_No_internet_service', 'StreamingTV_Yes', 'StreamingMovies_No_internet_service', 'StreamingMovies_Yes', 'Contract_One_year', 'Contract_Two_year', 'PaymentMethod_Credit_card__automatic_', 'PaymentMethod_Electronic_check']
In [31]:
# Splitting data with optimal features
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=50)
# Running logistic regression model
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
# Evaluate the model
y_pred = log_model.predict(X_test)
# You can add evaluation metrics here, for example:
from sklearn.metrics import accuracy_score, classification_report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.8012304779933743
Classification Report:
precision recall f1-score support
0 0.85 0.88 0.87 1556
1 0.64 0.57 0.60 557
accuracy 0.80 2113
macro avg 0.74 0.73 0.74 2113
weighted avg 0.80 0.80 0.80 2113
In [32]:
### Trying other machine learning algorithms: SVC
svc_model = modeling(SVC, 'SVC Classification')
SVC Classification accuracy: 0.7960246095598675 precision: 0.6431818181818182 recall: 0.5080789946140036 f1_score: 0.7877515790466652
In [33]:
#Random forest
rf_model = modeling(RandomForestClassifier, "Random Forest Classification")
Random Forest Classification accuracy: 0.7860861334595362 precision: 0.6164079822616408 recall: 0.4991023339317774 f1_score: 0.7783618089789716
In [34]:
#Decision tree
dt_model = modeling(DecisionTreeClassifier, "Decision Tree Classification")
Decision Tree Classification accuracy: 0.7335541883577852 precision: 0.49491525423728816 recall: 0.5242369838420108 f1_score: 0.7359592535740278
In [35]:
#Naive bayes
nb_model = modeling(GaussianNB, "Naive Bayes Classification")
Naive Bayes Classification accuracy: 0.6436346426881212 precision: 0.41792294807370184 recall: 0.895870736086176 f1_score: 0.6625965549469691
In [44]:
## Improve best model by hyperparameter tuning
# define model
model = LogisticRegression()
# define evaluation
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
from scipy.stats import loguniform
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = loguniform(1e-5, 1000)
# define search
from sklearn.model_selection import RandomizedSearchCV
search = RandomizedSearchCV(model, space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
# execute search
result = search.fit(X_rfe, y)
# summarize result
# print('Best Score: %s' % result.best_score_)
# print('Best Hyperparameters: %s' % result.best_params_)
params = result.best_params_
#Improving the Logistic Regression model
log_model = modeling(LogisticRegression, 'Logistic Regression Classification', params=params)
Logistic Regression Classification accuracy: 0.8007572172266919 precision: 0.6349206349206349 recall: 0.5745062836624776 f1_score: 0.7974490678620106
C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py:540: FitFailedWarning:
9390 fits failed out of a total of 15000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
364 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1466, in wrapper
estimator._validate_params()
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 666, in _validate_params
validate_parameter_constraints(
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l2', 'elasticnet', 'l1'} or None. Got 'none' instead.
--------------------------------------------------------------------------------
598 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1466, in wrapper
estimator._validate_params()
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 666, in _validate_params
validate_parameter_constraints(
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'elasticnet', 'l2', 'l1'} or None. Got 'none' instead.
--------------------------------------------------------------------------------
1661 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1466, in wrapper
estimator._validate_params()
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 666, in _validate_params
validate_parameter_constraints(
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'l2', 'elasticnet'} or None. Got 'none' instead.
--------------------------------------------------------------------------------
345 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1466, in wrapper
estimator._validate_params()
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 666, in _validate_params
validate_parameter_constraints(
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'elasticnet', 'l1', 'l2'} or None. Got 'none' instead.
--------------------------------------------------------------------------------
572 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1466, in wrapper
estimator._validate_params()
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 666, in _validate_params
validate_parameter_constraints(
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'elasticnet', 'l2'} or None. Got 'none' instead.
--------------------------------------------------------------------------------
1590 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 67, in _check_solver
raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or None penalties, got elasticnet penalty.
--------------------------------------------------------------------------------
1140 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 67, in _check_solver
raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or None penalties, got l1 penalty.
--------------------------------------------------------------------------------
960 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 67, in _check_solver
raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or None penalties, got l1 penalty.
--------------------------------------------------------------------------------
1110 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 75, in _check_solver
raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.
--------------------------------------------------------------------------------
1050 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 67, in _check_solver
raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or None penalties, got elasticnet penalty.
C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_search.py:1102: UserWarning:
One or more of the test scores are non-finite: [ nan nan nan nan 0.79956312 0.80117236
0.73463008 0.73463008 nan nan nan nan
nan nan 0.80443746 nan nan nan
0.73463008 nan 0.80443773 0.80230812 nan 0.73463008
0.8043427 nan nan nan nan 0.8043431
0.73463008 nan nan nan nan 0.80443766
nan nan 0.78413557 nan 0.80358599 0.80439025
0.78342602 0.79842716 nan 0.73463008 nan nan
nan nan nan nan nan nan
nan nan nan nan nan nan
0.80453229 nan 0.80410656 0.80121964 nan 0.80027254
nan 0.80443746 nan 0.73463008 nan nan
nan nan nan 0.80448494 nan 0.80282855
0.73463008 nan nan 0.80434303 0.73463008 nan
0.73463008 nan nan 0.78455936 nan nan
0.73463008 nan nan 0.8033021 nan nan
nan nan 0.73463008 nan nan nan
nan 0.79378889 0.80334925 nan 0.80273419 nan
0.73463008 0.80140904 nan nan 0.80334912 nan
nan 0.80443746 nan nan nan nan
0.80443746 0.80420125 nan nan 0.73463008 0.73463008
nan nan 0.79322064 nan 0.73463008 0.80448467
0.80443766 0.73463008 0.73463008 nan nan 0.80453229
0.80448494 nan 0.80315979 nan 0.80439011 nan
0.73463008 nan nan nan 0.73463008 nan
nan 0.73463008 0.73463008 nan nan nan
nan nan 0.80453222 0.73463008 0.80405907 nan
nan nan nan 0.80268711 nan nan
nan 0.80448474 0.80268684 nan nan 0.80434303
0.79757488 nan nan 0.73463008 nan nan
0.80434303 nan 0.73463008 nan nan 0.73463008
0.8043427 nan nan nan nan 0.74811802
0.80358586 0.80344388 nan 0.79672308 nan nan
nan nan nan 0.73463008 nan 0.73463008
nan nan nan 0.80429588 0.80448467 0.80457964
nan 0.80453236 nan nan nan nan
0.80107746 nan nan nan 0.73463008 nan
nan nan nan nan 0.7975276 nan
0.80358599 nan 0.73463008 0.79880628 nan nan
0.80192933 0.73463008 nan nan nan nan
nan 0.80415384 nan nan 0.80448474 nan
0.80368056 0.73463008 nan nan nan nan
0.80325455 nan nan 0.73463008 nan 0.79601433
nan nan nan 0.73463008 0.73463008 0.80330197
0.80448494 nan 0.80306496 0.73463008 nan 0.80121958
nan nan 0.73463008 nan nan nan
nan nan 0.80297107 0.802971 0.80405928 nan
nan nan nan 0.80439038 nan 0.73463008
nan nan 0.7872593 0.75971544 nan nan
nan nan nan 0.80316033 0.79667614 nan
nan 0.80306529 nan nan nan 0.73463008
nan nan 0.80453243 nan nan 0.73463008
nan 0.80301835 nan 0.80443753 nan 0.80420125
nan nan 0.80443746 nan nan 0.78451221
nan 0.80372784 0.73463008 nan 0.80420125 0.80325435
nan 0.73463008 nan nan nan nan
nan 0.80429575 nan nan nan 0.73463008
0.80457964 nan nan nan 0.80462692 nan
0.80358593 nan nan nan 0.73463008 0.73463008
nan 0.8042486 nan 0.73463008 0.73463008 0.80443746
nan nan 0.73463008 0.79984707 nan 0.73463008
nan 0.80292318 0.79236783 nan nan 0.80339633
nan 0.8011725 0.80268698 nan 0.80415397 nan
nan nan nan nan nan 0.80306503
nan 0.80088834 nan nan 0.80448494 nan
nan nan nan 0.80448474 nan nan
0.80339667 0.77504708 nan nan nan nan
0.73463008 0.73463008 0.80434303 0.73463008 nan nan
0.73463008 0.73463008 0.79113986 nan nan nan
nan 0.73463008 0.80306543 nan 0.73463008 0.73463008
nan nan 0.80448481 nan 0.7885371 nan
nan 0.80282902 nan nan nan nan
0.73463008 nan nan nan nan nan
nan nan 0.79937386 0.78337605 nan 0.80443746
0.73463008 nan nan nan 0.73524547 nan
nan nan 0.73463008 nan nan 0.76245856
nan nan 0.80457964 nan nan nan
nan nan nan nan nan 0.8044378
nan nan nan nan nan nan
nan 0.80358572 0.80301774 nan 0.79762217 nan
nan nan nan 0.80434303 nan nan
nan nan 0.80448481 nan nan nan
0.80278181 nan 0.80330136 nan 0.80462692 nan
nan nan]
In [40]:
#Saving best model
import joblib
#Sava the model to disk
filename = 'model.sav'
joblib.dump(log_model, filename)
Out[40]:
['model.sav']
In [ ]: